In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
%matplotlib inline
In [2]:
from pymatgen import Composition
In [3]:
# Tab-separated ICSD ternary export; the file has no header row, so the
# column names are supplied explicitly.
old_icsd = pd.read_csv(
    "../ICSD/icsd-ternaries.csv",
    sep='\t',
    header=None,
    names=[
        "Entry Number",
        "Space Group",
        "Structure",
        "Structure Type",
        "Description",
        "Authors",
        "Reference",
    ],
)
old_icsd.head()
Out[3]:
In [4]:
old_icsd["Structure Type"].value_counts()
Out[4]:
In [5]:
Structures=[Composition(j).formula for j in old_icsd["Structure"].values]
In [6]:
old_icsd["Structures"]=Structures
In [7]:
import fingerprint as fp
# Load the structure collection through the project's fingerprint module;
# the same object is bound to both struct_all and s_all.
# NOTE(review): presumably fp.read_pickle wraps pickle -- unpickling can run
# arbitrary code, so confirm struct_all.pickle comes from a trusted source.
struct_all=s_all=fp.read_pickle("struct_all.pickle")
In [8]:
all_comps=[x.composition.formula for x in struct_all]
In [9]:
def find_overlap(struct_type, icsd=None, comps=None):
    """Return the unique ICSD formulas of one structure type that are also known compositions.

    Parameters
    ----------
    struct_type : str
        Value matched against the "Structure Type" column.
    icsd : DataFrame, optional
        Table with "Structure Type" and "Structures" columns. Defaults to the
        notebook-level ``old_icsd``.
    comps : iterable of str, optional
        Formulas to intersect with. Defaults to the notebook-level ``all_comps``.

    Returns
    -------
    list of str
        Matching formulas, in the sorted order produced by ``np.unique``.

    Prints a one-line summary with the number of matches.
    """
    if icsd is None:
        icsd = old_icsd
    if comps is None:
        comps = all_comps
    # A set makes each membership test constant-time instead of a scan of
    # the whole composition list.
    known = set(comps)
    lis = icsd[icsd["Structure Type"] == struct_type]["Structures"]
    overlap = [name for name in (str(x) for x in np.unique(lis)) if name in known]
    print("{} matches found for {}".format(len(overlap), struct_type))
    return overlap
In [10]:
# One overlap list per structure type. The calls run in the listed order, so
# the summary lines printed by find_overlap appear in the same sequence.
(overlap_GdFeO3, overlap_122, overlap_CaTiO3,
 overlap_NaCl, overlap_spinel, overlap_delaf) = [
    find_overlap(type_name)
    for type_name in (
        "Perovskite-GdFeO3",
        "ThCr2Si2",
        "Perovskite-CaTiO3",
        "NaCl",
        "Spinel-Al2MgO4",
        "Delafossite-NaCrS2",
    )
]
In [11]:
def matching_struct(comp_list, structs=None):
    """Return every structure whose ``composition.formula`` is in ``comp_list``.

    Parameters
    ----------
    comp_list : iterable of str
        Formulas to keep.
    structs : iterable, optional
        Objects exposing ``.composition.formula``. Defaults to the
        notebook-level ``struct_all``.

    Returns
    -------
    list
        Matching structures, in the order they appear in ``structs``.
    """
    if structs is None:
        structs = struct_all
    # Set lookup avoids rescanning comp_list for every structure.
    wanted = set(comp_list)
    return [s for s in structs if s.composition.formula in wanted]
In [12]:
# Resolve each overlap list to the corresponding structure objects.
(GdFeO3_structs, one22_structs, CaTiO3_structs,
 NaCl_structs, Spinel_structs, delaf_structs) = [
    matching_struct(formulas)
    for formulas in (
        overlap_GdFeO3,
        overlap_122,
        overlap_CaTiO3,
        overlap_NaCl,
        overlap_spinel,
        overlap_delaf,
    )
]
In [13]:
import itertools
# Concatenate the 122, CaTiO3, spinel and delafossite groups in that order;
# the GdFeO3 and NaCl lists are not part of this working set.
matching_structs = list(
    itertools.chain(one22_structs, CaTiO3_structs, Spinel_structs, delaf_structs)
)
len(matching_structs)
Out[13]:
In [15]:
# Map formula -> integer class label:
#   0 = delafossite, 1 = CaTiO3 perovskite, 2 = ThCr2Si2 (122), 3 = spinel.
# Groups are applied in this order, so a formula present in several groups
# keeps the label of the last one. NaCl (would be label 4) is left out.
dict_structs = {}
for class_id, formulas in enumerate(
        (overlap_delaf, overlap_CaTiO3, overlap_122, overlap_spinel)):
    for formula in formulas:
        dict_structs[formula] = class_id
print(len(dict_structs))
In [15]:
import tqdm
In [16]:
# One fingerprint list per observable ("ones", "Z", "Chi"), each computed over
# every structure in matching_structs with rmax=10 and delta=0.05.
f_ones, f_Z, f_Chi = [
    [fp.get_phi(struct, obser=observable, rmax=10, delta=0.05)
     for struct in tqdm.tqdm_notebook(matching_structs)]
    for observable in ("ones", "Z", "Chi")
]
# Class label of each structure, looked up through its formula.
Structures = [struct.composition.formula for struct in matching_structs]
labels = [dict_structs[formula] for formula in Structures]
In [17]:
# Row k is the "ones", "Z" and "Chi" fingerprints of structure k laid end to end.
fingerprints = np.array([
    list(ones_part) + list(z_part) + list(chi_part)
    for ones_part, z_part, chi_part in zip(f_ones, f_Z, f_Chi)
])
In [18]:
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from sklearn.metrics import adjusted_rand_score
In [19]:
# K-means with 4 clusters on the raw fingerprints. random_state is fixed so
# the confusion matrix and adjusted Rand score are reproducible across runs,
# matching the seed used by the other estimators in this notebook.
Km = KMeans(n_clusters=4, n_init=250, random_state=42)
clust_km = Km.fit_predict(fingerprints)
print(confusion_matrix(labels, clust_km))
print(adjusted_rand_score(labels, clust_km))
In [20]:
from sklearn.cluster import AgglomerativeClustering
In [21]:
Ag=AgglomerativeClustering(n_clusters=4)
In [22]:
# Agglomerative clustering into 4 groups on the raw fingerprints, scored
# against the known labels.
Ag = AgglomerativeClustering(n_clusters=4)
clust_ag = Ag.fit_predict(fingerprints)
for metric in (confusion_matrix, adjusted_rand_score):
    print(metric(labels, clust_ag))
In [23]:
from numpy import random
# Shuffle labels and fingerprints with one seeded permutation so rows stay aligned.
r = random.RandomState(42)
perm_state = r.permutation(len(matching_structs))
labels = np.array(labels)  # array form is required for fancy indexing below
labels_perm, fingerprints_perm = labels[perm_state], fingerprints[perm_state]
In [24]:
from sklearn.svm import SVC
In [25]:
# Fit an SVC on the shuffled data; the predictions are made on the same
# samples the model was fitted on.
sv = SVC(random_state=42)
clust_svc = sv.fit(fingerprints_perm, labels_perm).predict(fingerprints_perm)
In [26]:
# Confusion matrix, then adjusted Rand score, for the SVC predictions.
for metric in (confusion_matrix, adjusted_rand_score):
    print(metric(labels_perm, clust_svc))
In [27]:
from sklearn.svm import LinearSVC
lsv=LinearSVC(random_state=42)
In [28]:
lsv.fit(fingerprints_perm,labels_perm)
Out[28]:
In [29]:
# Score the fitted LinearSVC on the same shuffled samples.
clust_lsv = lsv.predict(fingerprints_perm)
for metric in (confusion_matrix, adjusted_rand_score):
    print(metric(labels_perm, clust_lsv))
In [30]:
from sklearn.linear_model import LogisticRegression
In [31]:
# Logistic regression on the full concatenated fingerprint; predictions are
# made on the samples used for fitting.
logist = LogisticRegression()
clust_logist = logist.fit(fingerprints_perm, labels_perm).predict(fingerprints_perm)
for metric in (confusion_matrix, adjusted_rand_score):
    print(metric(labels_perm, clust_logist))
In [32]:
ones_perm=np.array(f_ones)[perm_state]
Z_perm=np.array(f_Z)[perm_state]
Chi_perm=np.array(f_Chi)[perm_state]
In [33]:
# Logistic regression using only the "ones" fingerprint.
logist2 = LogisticRegression()
clust_logist2 = logist2.fit(ones_perm, labels_perm).predict(ones_perm)
for metric in (confusion_matrix, adjusted_rand_score):
    print(metric(labels_perm, clust_logist2))
In [34]:
# Logistic regression using only the "Z" fingerprint.
logist3 = LogisticRegression()
clust_logist3 = logist3.fit(Z_perm, labels_perm).predict(Z_perm)
for metric in (confusion_matrix, adjusted_rand_score):
    print(metric(labels_perm, clust_logist3))
In [35]:
# Logistic regression using only the "Chi" fingerprint.
logist4 = LogisticRegression()
clust_logist4 = logist4.fit(Chi_perm, labels_perm).predict(Chi_perm)
for metric in (confusion_matrix, adjusted_rand_score):
    print(metric(labels_perm, clust_logist4))
In [36]:
from sklearn.decomposition import PCA
In [37]:
# Display names for class labels 0..3 ("Delafossite" spelling corrected).
label_names=["Delafossite","Perovskite","122","Spinel"]
In [38]:
# Legend names and scatter colours, indexed by class label 0..3
# ("Delafossite" spelling corrected so the plot legends read correctly).
label_names=np.array(["Delafossite","Perovskite","122","Spinel"])
c_arr=['r','g','y','b']
In [39]:
# Project the shuffled fingerprints onto two principal components and draw
# one scatter series per class.
pca = PCA(n_components=2)
pca_fingerprint = pca.fit_transform(fingerprints_perm)
plt.figure(figsize=(10, 10))
for class_id in range(4):
    members = labels_perm == class_id
    plt.scatter(
        pca_fingerprint[members, 0],
        pca_fingerprint[members, 1],
        c=c_arr[class_id],
        label=label_names[class_id],
    )
plt.legend()
Out[39]:
In [40]:
from sklearn.manifold import TSNE
In [41]:
# Two-dimensional t-SNE embedding of the shuffled fingerprints, one scatter
# series per class.
ts = TSNE(n_components=2, random_state=42)
tsne_fingerprints = ts.fit_transform(fingerprints_perm)
plt.figure(figsize=(10, 10))
for class_id in range(4):
    members = labels_perm == class_id
    plt.scatter(
        tsne_fingerprints[members, 0],
        tsne_fingerprints[members, 1],
        c=c_arr[class_id],
        label=label_names[class_id],
    )
plt.legend()
Out[41]:
In [42]:
# Feature matrix made of 5 PCA components followed by the 2 t-SNE coordinates.
pca = PCA(n_components=5)
pca_fingerprint = pca.fit_transform(fingerprints_perm)
ts2 = TSNE(n_components=2, random_state=42)
tsne_fingerprints = ts2.fit_transform(fingerprints_perm)
combined_finger = np.concatenate([pca_fingerprint, tsne_fingerprints], axis=1)
In [43]:
combined_finger.shape
Out[43]:
In [44]:
# SVC on the PCA + t-SNE features; predictions are made on the fitted samples.
sv2 = SVC(random_state=42)
clust_svc_projected = sv2.fit(combined_finger, labels_perm).predict(combined_finger)
for metric in (confusion_matrix, adjusted_rand_score):
    print(metric(labels_perm, clust_svc_projected))
In [45]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
combined_finger_scaled=scaler.fit_transform(combined_finger)
In [46]:
combined_finger_scaled.shape
Out[46]:
In [47]:
# K-means on the standardised PCA + t-SNE features. random_state is fixed so
# the reported scores are reproducible.
Km2 = KMeans(n_clusters=4, n_init=250, random_state=42)
clust_km_projected = Km2.fit_predict(combined_finger_scaled)
print(confusion_matrix(labels_perm, clust_km_projected))
print(adjusted_rand_score(labels_perm, clust_km_projected))
In [48]:
# Pairwise Euclidean distance matrix between fingerprints, computed as
# sqrt(|a|^2 + |b|^2 - 2 a.b) in one vectorised step instead of a Python
# double loop over every pair.
_sq_norms = np.einsum('ij,ij->i', fingerprints, fingerprints)
_sq_dists = (_sq_norms[:, np.newaxis] + _sq_norms[np.newaxis, :]
             - 2 * np.dot(fingerprints, fingerprints.T))
# Round-off can leave tiny negative values (notably on the diagonal); clamp
# them to zero so the square root does not produce NaN.
euclid = np.sqrt(np.maximum(_sq_dists, 0))
plt.figure(figsize=(12,12))
plt.imshow(euclid)
plt.colorbar()
Out[48]:
In [341]:
# Counter is imported here because this cell runs before the notebook's other
# `from collections import Counter` line when executed top to bottom.
from collections import Counter
# Number of structures per class label.
Counter(labels)
Out[341]:
In [49]:
from sklearn.metrics.pairwise import euclidean_distances
In [54]:
# Cumulative explained-variance ratio for 2..101 principal components.
# A single PCA fit with the largest component count is enough: the ratio for
# the first k components is the running sum of explained_variance_ratio_,
# which is also how the later cumulative-variance plot in this notebook works.
pca=PCA(n_components=101)
pca_fingerprint=pca.fit_transform(fingerprints)
cum_ratio=np.cumsum(pca.explained_variance_ratio_)
for i in range(100):
    # Clamp in case PCA kept fewer components than requested.
    kept=min(2+i,len(cum_ratio))
    print("Covariance ratio for {} components={}".format(2+i,cum_ratio[kept-1]))
In [58]:
[x for x in overlap_CaTiO3 if "Ti" in x]
Out[58]:
In [3]:
BaTiO3_struct=matching_structs(['Ba1 Ti1 O3'])[0]
In [2]:
def matching_structs(comp_list):
return[x for x in struct_all if x.composition.formula in comp_list]
In [4]:
print BaTiO3_struct
In [64]:
[x for x in overlap_CaTiO3 ]
Out[64]:
In [42]:
LaCoO3_struct=matching_struct(['La1 Co1 O3'])[0]
In [ ]:
# Display the structure (the dangling "." was a syntax error).
LaCoO3_struct
In [6]:
print LaCoO3_struct
In [7]:
st_list=[BaTiO3_struct,LaCoO3_struct]
st_list
Out[7]:
In [18]:
from sklearn.metrics.pairwise import euclidean_distances
In [33]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
# Compare the "ones" fingerprints of the two structures in st_list.
f_small_ones=[fp.get_phi(st_list[i],obser='ones',rmax=10,delta=0.05) for i in (0,1)]
plt.figure(figsize=(10,10))
# NOTE(review): this rebinds `labels`, which earlier cells use for the class
# labels of matching_structs.
labels=['BaTiO3','LaCoO3']
for i in range(2):
    plt.plot(f_small_ones[i],label=labels[i])
plt.legend()
plt.grid()
# euclidean_distances expects 2-D (n_samples, n_features) input, so each
# fingerprint is passed as a single-row matrix.
print(euclidean_distances(np.reshape(f_small_ones[0],(1,-1)),np.reshape(f_small_ones[1],(1,-1))))
In [86]:
reload(fp)
Out[86]:
In [32]:
reload(fp)
# Same comparison using the scaled fingerprint with 200 bins.
f_small_ones=[fp.get_phi_scaled(st_list[i],obser='ones',debug=True,n_bins=200) for i in (0,1)]
plt.figure(figsize=(10,10))
# NOTE(review): this rebinds `labels`, which earlier cells use for the class
# labels of matching_structs.
labels=['BaTiO3','LaCoO3']
for i in range(2):
    plt.plot(f_small_ones[i],label=labels[i])
plt.legend()
plt.grid()
# Squared distance written out as |a|^2 + |b|^2 - 2 a.b.
print(np.dot(f_small_ones[0],f_small_ones[0])+np.dot(f_small_ones[1],f_small_ones[1])-2*np.dot(f_small_ones[0],f_small_ones[1]))
# euclidean_distances expects 2-D (n_samples, n_features) input, so each
# fingerprint is passed as a single-row matrix.
print(euclidean_distances(np.reshape(f_small_ones[0],(1,-1)),np.reshape(f_small_ones[1],(1,-1))))
In [34]:
plt.plot(f_small_ones[0]-f_small_ones[1])
Out[34]:
In [ ]:
import tqdm
reload(fp)
# Scaled fingerprints (delta=0.05) of every selected structure, one list per
# observable.
f_ones_scaled, f_Z_scaled, f_Chi_scaled = [
    [fp.get_phi_scaled(struct, obser=observable, delta=0.05)
     for struct in tqdm.tqdm_notebook(matching_structs)]
    for observable in ("ones", "Z", "Chi")
]
In [16]:
# Formula and class label of each selected structure, in matching_structs order.
Structures_scaled = [struct.composition.formula for struct in matching_structs]
labels_scaled = [dict_structs[formula] for formula in Structures_scaled]
In [19]:
# Concatenate the three scaled fingerprints per structure and show the
# pairwise Euclidean distance matrix as an image.
fingerprints_scaled = np.array([
    list(ones_part) + list(z_part) + list(chi_part)
    for ones_part, z_part, chi_part in zip(f_ones_scaled, f_Z_scaled, f_Chi_scaled)
])
euclid_scaled = euclidean_distances(fingerprints_scaled)
plt.figure(figsize=(12, 12))
plt.imshow(euclid_scaled)
plt.colorbar()
Out[19]:
In [20]:
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from sklearn.metrics import adjusted_rand_score
# K-means with 4 clusters on the scaled fingerprints; random_state is fixed
# so the reported scores are reproducible.
Km = KMeans(n_clusters=4, n_init=250, random_state=42)
clust_km = Km.fit_predict(fingerprints_scaled)
print(confusion_matrix(labels_scaled, clust_km))
print(adjusted_rand_score(labels_scaled, clust_km))
In [21]:
from sklearn.cluster import AgglomerativeClustering
# Agglomerative clustering into 4 groups on the scaled fingerprints.
Ag = AgglomerativeClustering(n_clusters=4)
clust_ag = Ag.fit_predict(fingerprints_scaled)
for metric in (confusion_matrix, adjusted_rand_score):
    print(metric(labels_scaled, clust_ag))
In [22]:
# Drop every sample labelled 3 (spinel) from both the labels and the fingerprints.
not_spinel = np.not_equal(np.array(labels_scaled), 3)
labels_scaled_new, fingerprints_scaled_new = (
    np.array(labels_scaled)[not_spinel],
    fingerprints_scaled[not_spinel],
)
(len(labels_scaled_new), len(fingerprints_scaled_new))
Out[22]:
In [23]:
# K-means with 3 clusters on the spinel-free subset; random_state is fixed so
# the reported scores are reproducible.
Km = KMeans(n_clusters=3, n_init=250, random_state=42)
clust_km = Km.fit_predict(fingerprints_scaled_new)
print(confusion_matrix(labels_scaled_new, clust_km))
print(adjusted_rand_score(labels_scaled_new, clust_km))
In [24]:
# Agglomerative clustering into 3 groups on the spinel-free subset.
Ag = AgglomerativeClustering(n_clusters=3)
clust_ag = Ag.fit_predict(fingerprints_scaled_new)
print(confusion_matrix(labels_scaled_new, clust_ag))
print("Rand-score= " + str(adjusted_rand_score(labels_scaled_new, clust_ag)))
In [25]:
from sklearn.decomposition import PCA
# Legend names and scatter colours, indexed by class label 0..3
# ("Delafossite" spelling corrected so the plot legends read correctly).
label_names=np.array(["Delafossite","Perovskite","122","Spinel"])
c_arr=['r','g','y','b']
In [97]:
In [26]:
labels_scaled=np.array(labels_scaled)
In [39]:
# Fit 5 principal components on the scaled fingerprints; only the first two
# are drawn, one scatter series per class.
pca = PCA(n_components=5)
pca_fingerprint = pca.fit_transform(fingerprints_scaled)
plt.figure(figsize=(10, 10))
for class_id in range(4):
    members = labels_scaled == class_id
    plt.scatter(
        pca_fingerprint[members, 0],
        pca_fingerprint[members, 1],
        c=c_arr[class_id],
        label=label_names[class_id],
    )
plt.legend()
Out[39]:
In [33]:
from sklearn.manifold import TSNE
In [34]:
# Two-dimensional t-SNE embedding of the scaled fingerprints, one scatter
# series per class.
ts = TSNE(n_components=2, random_state=42)
ts_fingerprint = ts.fit_transform(fingerprints_scaled)
plt.figure(figsize=(10, 10))
for class_id in range(4):
    members = labels_scaled == class_id
    plt.scatter(
        ts_fingerprint[members, 0],
        ts_fingerprint[members, 1],
        c=c_arr[class_id],
        label=label_names[class_id],
    )
plt.legend()
Out[34]:
In [129]:
# Cumulative explained-variance ratio of the first 100 principal components.
n_comp = 100
pca = PCA(n_components=n_comp)
pca_fingerprint = pca.fit_transform(fingerprints_scaled)
cov = np.cumsum(pca.explained_variance_ratio_)
component_axis = np.arange(1, n_comp + 1)
plt.figure(figsize=(10, 10))
plt.grid()
plt.xlabel("Number of components")
plt.ylabel("Covariance Explained")
plt.plot(component_axis, cov, 'ro')
Out[129]:
In [35]:
pca=PCA(n_components=50)
pca_fingerprint_50=pca.fit_transform(fingerprints_scaled)
In [36]:
# K-means with 4 clusters on the 50-component PCA projection; random_state is
# fixed so the reported scores are reproducible.
Km = KMeans(n_clusters=4, n_init=250, random_state=42)
clust_km_pca = Km.fit_predict(pca_fingerprint_50)
print(confusion_matrix(labels_scaled, clust_km_pca))
print(adjusted_rand_score(labels_scaled, clust_km_pca))
In [37]:
# Agglomerative clustering into 4 groups on the 50-component PCA projection.
Ag = AgglomerativeClustering(n_clusters=4)
clust_ag_pca = Ag.fit_predict(pca_fingerprint_50)
for metric in (confusion_matrix, adjusted_rand_score):
    print(metric(labels_scaled, clust_ag_pca))
In [159]:
# Adjusted Rand score of 4-cluster agglomerative clustering as a function of
# how many leading principal components are kept (1..comp_lim).
comp_lim = 100
x_temp = np.arange(1, comp_lim + 1)
accu = np.zeros(comp_lim)
pca = PCA(n_components=comp_lim)
pca_fingerprint_running = pca.fit_transform(fingerprints_scaled)
for slot, n_kept in enumerate(x_temp):
    Ag = AgglomerativeClustering(n_clusters=4)
    clust_ag_running = Ag.fit_predict(pca_fingerprint_running[:, :n_kept])
    accu[slot] = adjusted_rand_score(labels_scaled, clust_ag_running)
plt.figure(figsize=(10, 10))
plt.plot(x_temp, accu, 'ro-')
best_slot = np.argmax(accu)
print("Maximum score obtained for n_comp={}, max score={}".format(x_temp[best_slot], np.amax(accu)))
In [147]:
# Same curve without the first four component counts.
plt.figure(figsize=(10,10))
plt.plot(x_temp[4:],accu[4:],'ro-')
# np.argmax returns a 0-based index; x_temp maps it back to the component
# count (index + 1), consistent with the summary printed when accu was built.
print("Maximum score obtained for n_comp={}, max score={}".format(x_temp[np.argmax(accu)],np.amax(accu)))
In [38]:
# Agglomerative clustering into 4 groups on a 6-component PCA projection.
pca = PCA(n_components=6)
pca_fingerprint_6 = pca.fit_transform(fingerprints_scaled)
Ag = AgglomerativeClustering(n_clusters=4)
clust_ag_pca6 = Ag.fit_predict(pca_fingerprint_6)
for metric in (confusion_matrix, adjusted_rand_score):
    print(metric(labels_scaled, clust_ag_pca6))
In [40]:
import tqdm
reload(fp)
# Scaled "ones" fingerprint for every structure in struct_all; the "Z" and
# "Chi" observables are not computed in this cell.
fall_ones_scaled = [
    fp.get_phi_scaled(struct, obser="ones", delta=0.05)
    for struct in tqdm.tqdm_notebook(struct_all)
]
In [43]:
num_atoms=[len(x.species) for x in struct_all]
In [45]:
from collections import Counter
num_counts=Counter(num_atoms)
Out[45]:
In [46]:
# num_array[i] = number of structures whose species list has length i, for
# 3 <= i < 265; every other slot stays at zero.
num_array=np.zeros(265)
for i in range(3,265):
    # `in` replaces dict.has_key, which exists only in Python 2.
    if i in num_counts:
        num_array[i]=num_counts[i]
In [47]:
num_array
Out[47]:
In [52]:
list(enumerate(np.cumsum(num_array)))
Out[52]:
In [53]:
structs_lim_50=[x for x in struct_all if len(x.species)<50]
In [55]:
structs_88=[x for x in struct_all if len(x.species)==88]
In [56]:
fomulae_88=[x.composition.formula for x in structs_88]
In [57]:
fomulae_88
Out[57]:
In [58]:
finger_pyro=[fp.get_phi_scaled(structs_88[i],obser='ones') for i in tqdm.tqdm_notebook(range(len(structs_88)))]
In [59]:
# Pairwise distance image for the fingerprints of the 88-site structures.
# Note that this reuses the name euclid_scaled.
euclid_scaled = euclidean_distances(finger_pyro)
plt.figure(figsize=(12, 12))
plt.imshow(euclid_scaled)
plt.colorbar()
Out[59]:
In [60]:
fall_ones_scaled=[fp.get_phi_scaled(structs_lim_50[i],obser="ones",delta=0.05) for i in tqdm.tqdm_notebook(range(len(structs_lim_50)))]
In [69]:
def phi_getter(i):
    """Return the scaled 'ones', 'Z' and 'Chi' fingerprints of structure ``i`` as one flat list.

    The three fingerprints are computed with fp.get_phi_scaled and joined in
    that order.
    """
    combined = []
    for observable in ('ones', 'Z', 'Chi'):
        combined.extend(fp.get_phi_scaled(i, obser=observable))
    return combined
In [ ]:
from multiprocessing import Pool
# Compute the combined fingerprints of the <50-site structures on 4 worker
# processes.
p=Pool(4)
try:
    finger_lt50=np.array(p.map(phi_getter,structs_lim_50))
finally:
    # Always release the worker processes, even if a fingerprint call fails.
    p.close()
    p.join()
finger_lt50.shape
In [74]:
finger_lt50.shape
Out[74]:
In [ ]: